Dane o jakosci powietrza pobrane zostaly za pomoca pakietu giosimport.
setwd("C:/Users/Admin/Desktop/pds/projekt")
kat_dost <- "C:/Users/Admin/Desktop/pds/projekt"
meta <- gios_metadane(type = "meta",
download = F, # zmieĆ na T, jeĆli uruchamiasz piewszy raz
path = kat_dost,
mode = "wb")
stanowiska <- gios_metadane(type = "stand",
download = F,
path = kat_dost,
mode = "wb")
#aktywne stacje jakoĆci powietrza zachodniopomorskie
filtr1 <- meta %>%
filter(status == "aktywny")%>%
filter(wojewodztwo == "ZACHODNIOPOMORSKIE")
gios_vis(data = filtr1) # mapkaTab. 1. Aktywne stacje monitoringu jakoĆci powietrza w wojewĂłdztwie zachodniopomorskim.
filtr2 <- stanowiska %>%
filter(status.stanowiska == "aktywny")%>%
filter(wojewodztwo == "ZACHODNIOPOMORSKIE")%>%
filter(typ.pomiaru == "automatyczny")%>%
filter(wskaznik == "pyĆ zawieszony PM10"| wskaznik == "pyĆ zawieszony PM2.5")
tabelka2=filtr2[,-c(1,2,3,6,8,9,10,12,14,15)]#tabelka
DT::datatable(tabelka2)Tab.2. Lista stacji na ktĂłrych byĆy wykonywane pomiary automatyczne stÄĆŒeĆ PM10 i PM2.5 w 2018 r.
filtr2 <- stanowiska %>%
filter(status.stanowiska == "aktywny")%>%
filter(wojewodztwo == "ZACHODNIOPOMORSKIE")%>%
filter(typ.pomiaru == "automatyczny")%>%
filter(wskaznik == "pyĆ zawieszony PM10"| wskaznik == "pyĆ zawieszony PM2.5")%>%
pull(kod.stacji)
gios_vis(data = meta %>%
filter(kod.stacji %in% filtr2)) Rys. 2. Graficzne przedstawie lokalizacji aktywnych stacji jakoĆci powietrza na ktĂłrych byĆy wykonywane pomiary automatyczne stÄĆŒeĆ PM10 i PM2.5 w wojewĂłdztwie zachodniopomorskim.
filtr3 <- stanowiska %>%
filter(status.stanowiska == "aktywny")%>%
filter(wojewodztwo == "ZACHODNIOPOMORSKIE")%>%
filter(typ.pomiaru == "manualny")%>%
filter(wskaznik == "pyĆ zawieszony PM10"| wskaznik == "pyĆ zawieszony PM2.5")
tabelka3=filtr3[,-c(1,2,3,6,8,9,10,12,14,15)] #tabelka
DT::datatable(tabelka3)Tab. 3. Lista stacji na ktĂłrych byĆy wykonywane pomiary automatyczne stÄĆŒeĆ PM10 i PM2.5 w roku 2018
filtr3 <- stanowiska %>%
filter(status.stanowiska == "aktywny")%>%
filter(wojewodztwo == "ZACHODNIOPOMORSKIE")%>%
filter(typ.pomiaru == "manualny")%>%
filter(wskaznik == "pyĆ zawieszony PM10"| wskaznik == "pyĆ zawieszony PM2.5")%>%
pull(kod.stacji)
gios_vis(data = meta %>%
filter(kod.stacji %in% filtr3)) Rys. 3. Graficzne przedstawie lokalizacji aktywnych stacji jakoĆci powietrza na ktĂłrych byĆy wykonywane pomiary manualne stÄĆŒeĆ PM10 i PM2.5 w wojewĂłdztwie zachodniopomorskim.
KorzystajÄ c z pakietu giosimport pobraliĆmy dane jakoĆci powietrza dla lat 2008-2017. Pobrane dane zĆÄ czyliĆmy i zapisaliĆmy do pliku .RData. NastÄpnie wyselekcjonowaliĆmy dane dla wojewĂłdztwa zachodniopomorskiego.
load(file = "C:/Users/Admin/Desktop/pds/projekt/pm1h_raw.RData")
pm1h <- pm1h_raw
rm(pm1h_raw)
pm <- unique(pm1h$kod)
pm[str_detect(pm, "^Zp")] -> kody_zp
pm1h <- filter(pm1h, kod == kody_zp)
pm1h <- gios_kody(data = pm1h, meta = meta)
pm1h %>%
filter(sub == "PM10" | sub == "PM2.5") %>%
mutate(rok = year(date), miesiac = month(date), godzina = hour(date)) -> pm1h
load(file = "C:/Users/Admin/Desktop/pds/projekt/pm24h_raw.RData")
pm24h <- pm24h_raw
rm(pm24h_raw)
pm <- unique(pm24h$kod)
pm[str_detect(pm, "^Zp")] -> kody_zp
pm24h <- filter(pm24h, kod == kody_zp)
pm24h <- gios_kody(data = pm24h, meta = meta)
pm24h %>%
filter(sub == "PM10" | sub == "PM2.5") %>%
mutate(rok = year(date), miesiac = month(date), godzina = hour(date)) -> pm24hRmisc::summarySE(data = pm1h,
measurevar = "obs",
groupvars = c("sub","kod"),
na.rm = T) %>%
mutate_if(is.numeric, round, 2) -> wynik_1h
Rmisc::summarySE(data = pm24h,
measurevar = "obs",
groupvars = c("sub","kod"),
na.rm = T) %>%
mutate_if(is.numeric,round,2) -> wynik_24h
rbind(data.frame(wynik_1h, czas_mu = "1g"),
data.frame(wynik_24h, czas_mu = "24g")) -> wynik_RMISC
position_d <- position_dodge(0.4)
knitr::kable(wynik_1h, digits = 2)| sub | kod | N | obs | sd | se | ci |
|---|---|---|---|---|---|---|
| PM10 | ZpGryfGryfinoDO | 1131 | 22.06 | 23.13 | 0.69 | 1.35 |
| PM10 | ZpGryfMarwiceDO | 1480 | 16.97 | 13.26 | 0.34 | 0.68 |
| PM10 | ZpGryfStokiDO | 1459 | 13.84 | 12.27 | 0.32 | 0.63 |
| PM10 | ZpKoszArKraj | 2411 | 24.39 | 17.79 | 0.36 | 0.71 |
| PM10 | ZpStarLipnikDO | 922 | 18.11 | 15.91 | 0.52 | 1.03 |
| PM10 | ZpSzczAndr01 | 3735 | 24.34 | 20.12 | 0.33 | 0.65 |
| PM10 | ZpSzczecinDO | 689 | 34.85 | 32.27 | 1.23 | 2.41 |
| PM10 | ZpSzczecPrze | 2869 | 27.30 | 22.92 | 0.43 | 0.84 |
| PM10 | ZpSzczLacz04 | 3538 | 22.11 | 16.15 | 0.27 | 0.53 |
| PM2.5 | ZpSzczAndr01 | 3179 | 17.40 | 16.07 | 0.28 | 0.56 |
| PM2.5 | ZpSzczPils02 | 4549 | 19.76 | 17.18 | 0.25 | 0.50 |
| sub | kod | N | obs | sd | se | ci |
|---|---|---|---|---|---|---|
| PM10 | ZpKoszalinWSSE | 30 | 15.90 | 11.36 | 2.07 | 4.24 |
| PM10 | ZpKoszArKraj | 47 | 24.66 | 16.65 | 2.43 | 4.89 |
| PM10 | ZpKoszSpasow | 125 | 22.75 | 11.65 | 1.04 | 2.06 |
| PM10 | ZpMyslZaBram | 62 | 28.96 | 18.90 | 2.40 | 4.80 |
| PM10 | ZpSwinoujscieWSSE | 13 | 15.00 | 8.63 | 2.39 | 5.22 |
| PM10 | ZpSzcSzczecinek009 | 77 | 31.41 | 26.55 | 3.03 | 6.03 |
| PM10 | ZpSzcSzczecinekPSSE | 30 | 26.03 | 19.59 | 3.58 | 7.32 |
| PM10 | ZpSzczAndr01 | 126 | 23.02 | 15.48 | 1.38 | 2.73 |
| PM10 | ZpSzczec1Maj | 122 | 27.21 | 19.54 | 1.77 | 3.50 |
| PM10 | ZpSzczecinWSSE | 11 | 22.82 | 10.15 | 3.06 | 6.82 |
| PM10 | ZpSzczecPrze | 46 | 23.14 | 12.60 | 1.86 | 3.74 |
| PM10 | ZpSzczLacz04 | 91 | 22.99 | 15.53 | 1.63 | 3.24 |
| PM10 | ZpSzczPils02 | 143 | 26.89 | 13.62 | 1.14 | 2.25 |
| PM10 | ZpWiduBulRyb | 141 | 24.32 | 16.27 | 1.37 | 2.71 |
| PM2.5 | ZpKoszSpasow | 107 | 13.89 | 12.32 | 1.19 | 2.36 |
| PM2.5 | ZpMyslZaBram | 101 | 21.58 | 18.88 | 1.88 | 3.73 |
| PM2.5 | ZpSzczAndr01 | 106 | 16.31 | 13.46 | 1.31 | 2.59 |
| PM2.5 | ZpSzczec1Maj | 106 | 17.92 | 13.94 | 1.35 | 2.68 |
ggplot(wynik_RMISC %>%
filter(sub == "PM10") %>%
mutate(kod = str_sub(kod,3,30)),
aes(x = kod,
y = obs,
colour = factor(czas_mu),
group = factor(czas_mu),
fill = factor(czas_mu)))+
geom_errorbar(aes(ymin = obs-ci,
ymax = obs+ci),
width = .2,
position = position_d)+
geom_point(position = position_d,
size = 1,
shape = 21)+
labs(x = "kod stacji",
y = openair::quickText("StÄĆŒenie PM10 [ug/m3]"),
title = "Zestawienie stÄĆŒeĆ Ćredniorocznych pyĆĂłw zawieszonych PM10",
color = "czas_mu", fill = "czas_mu")+
theme_bw()+
theme(legend.justification = c(0.95,0.95),
legend.position = c(0.95,0.95),
axis.text.x = element_text(angle = 45, vjust = 0.4))ggplot(wynik_RMISC %>%
filter(sub == "PM2.5") %>%
mutate(kod = str_sub(kod,3,30)),
aes(x = kod,
y = obs,
colour = factor(czas_mu),
group = factor(czas_mu),
fill = factor(czas_mu)))+
geom_errorbar(aes(ymin = obs-ci,
ymax = obs+ci),
width = .2,
position = position_d)+
geom_point(position = position_d,
size = 1,
shape = 21)+
labs(x = "kod stacji",
y = openair::quickText("StÄĆŒenie PM2.5 [ug/m3]"),
title = "Zestawienie stÄĆŒeĆ Ćredniorocznych pyĆĂłw zawieszonych PM2.5",
color = "czas_mu", fill = "czas_mu")+
theme_bw()+
theme(legend.justification = c(0.95,0.95),
legend.position = c(0.95,0.95),
axis.text.x = element_text(angle = 0, vjust = 0.4))Definiowanie funkcji pomocniczych.
fun_mean <- function(d, i) {
return(mean(d[i]))
}
fun_ci <- function(x, R = 1000, conf = 0.95) {
# szacujemy rozkĆad empiryczny statystyki
my.boot <- boot(data = x,
statistic = fun_mean,
R = R)
# wyznaczamy przedziaĆ ufnoĆci
ci <- boot.ci(boot.out = my.boot,
conf = conf,
type = "basic")$basic[4:5]
# zapisujemy interesujÄ
ce nas wyniki do wektora
ci <- data.frame(orginal = my.boot$t0,
lower = ci[1],
upper = ci[2])
return(ci)
}
pm1h %>%
na.omit() %>%
group_by(kod, sub) %>%
do(ci = fun_ci(x = .$obs, R = 1000, conf = 0.95)) -> ci_pm1
pm24h %>%
na.omit() %>%
group_by(kod, sub) %>%
do(ci = fun_ci(x = .$obs, R = 1000, conf = 0.95)) -> ci_pm24
rbind(data.frame(ci_pm1 %>% unnest(cols = ci), czas_mu = "1g"),
data.frame(ci_pm24 %>% unnest(cols = ci), czas_mu = "24g")) -> wyn_bootpd <- position_dodge(width = 0.8)
ggplot(wyn_boot %>%
filter(sub == "PM10") %>%
mutate(kod = str_sub(kod, 1, 30)),
aes(x = kod,
y = orginal,
colour = factor(czas_mu),
group = factor(czas_mu),
fill = factor(czas_mu))) +
geom_errorbar(aes(ymin = lower,
ymax = upper,
colour = czas_mu),
width = .6, position = pd) +
geom_point(size = 2,
shape = 21,
position = pd) +
labs(x = "Kod Stacji",
y = openair::quickText("StÄĆŒenie PM10 [ug/m3]"),
color = "Czas uĆredniania", fill = "Czas uĆredniania") +
theme_bw() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
ggtitle("Zestawienie stÄĆŒeĆ Ćredniorocznych pyĆĂłw zawieszonych PM10")ggplot(wyn_boot %>%
filter(sub == "PM2.5") %>%
mutate(kod = str_sub(kod, 1, 30)),
aes(x = kod,
y = orginal,
colour = factor(czas_mu),
group = factor(czas_mu),
fill = factor(czas_mu))) +
geom_errorbar(aes(ymin = lower,
ymax = upper,
colour = czas_mu),
width = .6, position = pd) +
geom_point(size = 2,
shape = 21,
position = pd) +
labs(x = "Kod Stacji",
y = openair::quickText("StÄĆŒenie PM2,5 [ug/m3]"),
color = "Czas uĆredniania", fill = "Czas uĆredniania") +
theme_bw() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
ggtitle("Zestawienie stÄĆŒeĆ Ćredniorocznych pyĆĂłw zawieszonych PM2.5")wynik_RMISC %>%mutate(lower = obs - ci,
upper = obs + ci,
type = "rmisc") -> wynik_RMISC_mutate
wynik_RMISC_mutate <- rename(wynik_RMISC_mutate, c("obs" = "orginal"))
rbind(wyn_boot %>%
mutate(type = "boot"),
wynik_RMISC_mutate %>%
select(kod, sub, orginal, lower, upper, czas_mu, type)
) -> wynik_bind_uf
position_d <- position_dodge(0.4)
ggplot(wynik_bind_uf %>%
filter(sub == "PM10",czas_mu == "1g") %>%
mutate(kod = str_sub(kod, 3,30)),
aes(x = kod,
y = orginal,
colour = factor(type),
group = factor(type),
fill = factor(type)))+
geom_errorbar(aes(ymin = lower,
ymax = upper),
width = .2,
position = position_d)+
geom_point(position = position_d,
size = 1,
shape = 21)+
labs(x = "Kod stacji",
y = openair::quickText("StÄĆŒenie PM10[ug/m3]"),
color = "type", fill = "type")+
theme_bw()+
theme(legend.justification = c(0.95, 0.95),
legend.position = c(0.95, 0.95),
axis.text.x = element_text(angle = 45, vjust = 0.4))+
ggtitle("WartoĆci Ćrednioroczne stÄĆŒeĆ PM10 - stacje automatyczne")ggplot(wynik_bind_uf %>%
filter(sub == "PM2.5",czas_mu == "1g") %>%
mutate(kod = str_sub(kod, 3,30)),
aes(x = kod,
y = orginal,
colour = factor(type),
group = factor(type),
fill = factor(type)))+
geom_errorbar(aes(ymin = lower,
ymax = upper),
width = .2,
position = position_d)+
geom_point(position = position_d,
size = 1,
shape = 21)+
labs(x = "Kod stacji",
y = openair::quickText("StÄĆŒenie PM2.5[ug/m3]"),
color = "type", fill = "type")+
theme_bw()+
theme(legend.justification = c(0.95, 0.95),
legend.position = c(0.95, 0.95),
axis.text.x = element_text(angle = 0, vjust = 0.4))+
ggtitle("WartoĆci Ćrednioroczne stÄĆŒeĆ PM2.5 - stacje automatyczne ")position_d <- position_dodge(0.4)
ggplot(wynik_bind_uf %>%
filter(sub == "PM10",czas_mu == "24g") %>%
mutate(kod = str_sub(kod, 3,30)),
aes(x = kod,
y = orginal,
colour = factor(type),
group = factor(type),
fill = factor(type)))+
geom_errorbar(aes(ymin = lower,
ymax = upper),
width = .2,
position = position_d)+
geom_point(position = position_d,
size = 1,
shape = 21)+
labs(x = "Kod stacji",
y = openair::quickText("StÄĆŒenie PM10[ug/m3]"),
color = "type", fill = "type")+
theme_bw()+
theme(legend.justification = c(0.95, 0.95),
legend.position = c(0.95, 0.95),
axis.text.x = element_text(angle = 45, vjust = 0.4))+
ggtitle("WartoĆci Ćrednioroczne stÄĆŒeĆ PM10 - stacje manualne")ggplot(wynik_bind_uf %>%
filter(sub == "PM2.5",czas_mu == "24g") %>%
mutate(kod = str_sub(kod, 3,30)),
aes(x = kod,
y = orginal,
colour = factor(type),
group = factor(type),
fill = factor(type)))+
geom_errorbar(aes(ymin = lower,
ymax = upper),
width = .2,
position = position_d)+
geom_point(position = position_d,
size = 1,
shape = 21)+
labs(x = "Kod stacji",
y = openair::quickText("StÄĆŒenie PM2.5[ug/m3]"),
color = "type", fill = "type")+
theme_bw()+
theme(legend.justification = c(0.95, 0.95),
legend.position = c(0.95, 0.95),
axis.text.x = element_text(angle = 0, vjust = 0.4))+
ggtitle("WartoĆci Ćrednioroczne stÄĆŒeĆ PM2.5- stacje manualne")Korelacja pomiedzy PM10 i PM2.5 wraz z przedzialem ufnosci na poziomie 95%.
Deklaracja funkcji:
fun_r <- function(d, i, x, y, metoda) {
d2 <- d[i,]
cor(d2[,x], d2[,y], method = metoda)
}
fun_ci_r <- function(data, x, y, R = 500, conf = 0.95, metoda_corr = "pearson") {
my.boot <- boot(data = data,
statistic = fun_r,
R = R,
x = x,
y = y,
metoda = metoda_corr)
ci <- boot.ci(boot.out = my.boot,
conf = conf,
type = "basic")$basic[4:5]
ci <- data.frame(orginal = my.boot$t0,
lower = ci[1],
upper = ci[2])
return(ci)
}pm1h_corr <- pm1h %>%
spread(key = sub, value = obs) %>%
na.omit() %>%
group_by(kod) %>%
nest()
pm1h_corr <- pm1h_corr %>%
mutate(kor = map(.x = data,
.f = fun_ci_r,
x = "PM10",
y = "PM2.5"))
pm1h_corr <- pm1h_corr %>%
unnest(cols = kor)names(pm1h_corr)[names(pm1h_corr) == "PM2.5"] <- "R"
ggplot(pm1h_corr, aes(x = kod, y = R)) +
geom_errorbar(aes(ymin = lower,
ymax = upper),
width = .2) +
geom_point(size = 2.5,
shape = 21,
fill = "black") +
geom_line(group = 1, color = "grey") +
labs(x = "Kod stacji",
y = openair::quickText("WspĂłĆczynnik Korelacji Pearsona"))W przypadku wojewĂłdztwa Zachodnio Pomorskiego korelacje mozemy policzyc wylacznie dla 1 stacji znajdujacej sie w Szczecinie. Z wykresy mozna odczytac, ze korelacja jest bardzo wysoka i wynosi 0.925.
pm1h_corr <- pm1h %>%
spread(key = sub, value = obs) %>%
na.omit() %>%
group_by(miesiac, kod) %>%
nest()
pm1h_corr <- pm1h_corr %>%
mutate(kor = map(.x = data,
.f = fun_ci_r,
x = "PM10",
y = "PM2.5"))
pm1h_corr <- pm1h_corr %>%
unnest(cols = kor)names(pm1h_corr)[names(pm1h_corr) == "PM2.5"] <- "R"
ggplot(pm1h_corr, aes(x = factor(miesiac), y = R)) +
geom_errorbar(aes(ymin = lower,
ymax = upper),
width = .2) +
geom_point(size = 2.5,
shape = 21,
fill = "black") +
geom_line(group = 1, color = "grey") +
labs(x = "MiesiÄ
c",
y = openair::quickText("WspĂłĆczynnik Korelacji Pearsona ze wzglÄdu na miesiÄ
c")) +
facet_wrap(~pm1h_corr$kod)Z wykresu mozna odczytac, ze korelacj pomiedzy pylem PM10 i PM2.5 w okresie zimowym jest wyzsza niz w pozostalych. W sierpniu korelacja spada nawet ponizej 0.5.
pm1h_corr <- pm1h %>%
spread(key = sub, value = obs) %>%
na.omit() %>%
group_by(godzina, kod) %>%
nest()
pm1h_corr <- pm1h_corr %>%
mutate(kor = map(.x = data,
.f = fun_ci_r,
x = "PM10",
y = "PM2.5"))
pm1h_corr <- pm1h_corr %>%
unnest(cols = kor)names(pm1h_corr)[names(pm1h_corr) == "PM2.5"] <- "R"
ggplot(pm1h_corr, aes(x = factor(godzina), y = R)) +
geom_errorbar(aes(ymin = lower,
ymax = upper),
width = .2) +
geom_point(size = 2.5,
shape = 21,
fill = "black") +
geom_line(group = 1, color = "grey") +
labs(x = "Godzina",
y = openair::quickText("WspĂłĆczynnik Korelacji Pearsona ze wzglÄdu na godzinÄ")) +
facet_wrap(~pm1h_corr$kod)Widzimy, ze dla godziny 11 korelacja jest nizsza niz dla pozostalych godzin oznaczonych na wykresi. Mimo to utrzymuje sie na wyskokim poziomie
Dane o jakoĆci powietrza wczytaliĆmy dla stacji Koszalin_ArmiiKrajowej.
kat_dost <- "C:/Users/Admin/Desktop/pds/projekt"
inp_pm10_1<- map(.x=pliki_all,
.f=~.[str_detect(.,pattern="PM10_1g")])%>%
flatten_chr()
PM10<- map_df(.x=inp_pm10_1,
.f=gios_read,
czas_mu="1g",
path=kat_dost)#78911
#"ZpSzczecinLukasza" "ZpSzczAndr01" "ZpSzczPils02" "ZpSzczLacz04" "ZpKoszArKraj" "ZpGryfGryfinoDO"
#"ZpGryfMarwiceDO" "ZpGryfStokiDO" "ZpStarLipnikDO" "ZpSzczecinDO" "ZpSzczecPrze"
koszalin <- PM10 %>%
filter(kod=="ZpKoszArKraj")Dane meteorologiczne wczytaliĆmy dla stacji âKOSZALINâ, byĆa ona najbliĆŒej powyĆŒszej stacji jakoĆci powietrza.
ggplot(data = koszalin, aes(x = date, y = obs)) +
geom_line() +
ggtitle("Wykres szeregu czasowego danych dla stacji Koszalin")ggplot(data = koszalin %>%
timeAverage(avg.time = "day"),
aes(x = date, y = obs)) +
geom_line()+
ggtitle("Ćrednie dniowe")ggplot(data = koszalin %>%
timeAverage(avg.time = "week"),
aes(x = date, y = obs)) +
geom_line()+
ggtitle("Ćrednie tygodniowe")ggplot(data = koszalin %>%
timeAverage(avg.time = "month"),
aes(x = date, y = obs)) +
geom_line()+
ggtitle("Ćrednie miesieczne")Na powyĆŒszych wykresach widac sezonowosc i trend naszych danych.
koszalin_m %>%
gather(key = "parametr", value = "obserwacje") %>%
gg_season()+
ggtitle("Wykres podserii dla wszystkich parametrĂłw")Na powyĆŒszym wykresie widac, ze na przestrzeni 4 lat nie widac znaczacych trendow w naszych predyktorach.
Wykresy rozrzutu.
koszalin_m %>%
ggplot(aes(x = obs, y = ws)) +
geom_point() +
theme_bw()+
ggtitle("Wykres rozrzutu miedzy predkoscia wiatru a PM10")koszalin_m %>%
ggplot(aes(x = obs, y = visibility)) +
geom_point() +
theme_bw()+
ggtitle("Wykres rozrzutu miedzy widocznoscia a PM10")koszalin_m %>%
ggplot(aes(x = obs, y = air_temp)) +
geom_point() +
theme_bw()+
ggtitle("Wykres rozrzutu miedzy temperatura powietrza a PM10")koszalin_m %>%
ggplot(aes(x = obs, y = atmos_pres)) +
geom_point() +
theme_bw()+
ggtitle("Wykres rozrzutu miedzy cisnieniem atmosferycznym a PM10")koszalin_m %>%
ggplot(aes(x = obs, y = dew_point)) +
geom_point() +
theme_bw()+
ggtitle("Wykres rozrzutu miedzy punktem rosy a PM10")koszalin_m %>%
ggplot(aes(x = obs, y = RH)) +
geom_point() +
theme_bw()+
ggtitle("Wykres rozrzutu miedzy wilgotnoscia wzgledna a PM10")Analizujac wykresy rozrzutu widac duze rozproszenie pomiedzy danymi, wiec na razie ciezko jest stwierdzic korelacje miedzy danymi.
koszalin_m %>%
gather(key = "parametr", value = "obserwacje") %>%
ggplot(aes(x = date, y = obserwacje)) +
geom_line() +
facet_wrap(~parametr, scales = "free_y", ncol = 1)+
ggtitle("Wykres serii czasowej dla wszystkich parametrow")Widac, ze temperatura powietrza i punkt rosy sa prawie identyczne.
Dla PM10 najlepsza korelacja jest miedzy temperatura powietrza, punktem rosy i widocznoscia. Wysoka korelacja pomiedzy predyktorami: punktem rosy i temperatura powietrza, widocznosc i wilgotnosc wzgledna, temperatura powietrza i widoczoscia.
koszalin_m %>% mutate(obs = log10(obs)) %>%
GGally::ggpairs(columns = 2:ncol(.))+
ggtitle("Log10(obs)")#log10(obs) poprawia korelacje miedzy obs a wszystkimi predyktorami oprĂłcz cisnienia atmosf.
koszalin_m %>% mutate(obs = log10(obs),
ws = log10(ws)) %>%
GGally::ggpairs(columns = 2:ncol(.))+
ggtitle("log10(obs),log10(ws) ") #nie pomoglokoszalin_m %>% mutate(ws = log10(ws)) %>%
GGally::ggpairs(columns = 2:ncol(.))+
ggtitle("log10(ws) ")#delikatnie poprawilokoszalin_m %>% mutate(obs = obs^2) %>%
GGally::ggpairs(columns = 2:ncol(.))+
ggtitle("obs^2") #ws lepiej, reszta gorszakoszalin_m %>% mutate(ws = ws^2) %>%
GGally::ggpairs(columns = 2:ncol(.))+
ggtitle("ws^2") #brak zmiankoszalin_m %>% mutate(ws = ws^2,
obs = obs^2) %>%
GGally::ggpairs(columns = 2:ncol(.))+
ggtitle("ws^2, obs^2")Funkcja logarytmiczna polepsza korelacje miedzy zmiennymi, ktĂłre nas interesuja. Funkcja kwadratowa raczej pogarsza.
Najlepsza korelacja wystepuje na 12 opoznieniu. Tutaj ladnie widac sezonowosc cykliczna roczna naszych danych.
Autokorelacja
Widac sezonowosc.
Widac sezonowosc i trend PM10.
grid.arrange(koszalin_m %>% ACF(ws, lag_max = 120) %>% autoplot() + ggtitle("predkosc wiatru"),
koszalin_m %>% ACF(air_temp, lag_max = 120) %>% autoplot() + ggtitle("temeratura"),
koszalin_m %>% ACF(RH, lag_max = 120) %>% autoplot() + ggtitle("wilgotnosc"),
koszalin_m %>% ACF(atmos_pres, lag_max = 120) %>% autoplot() + ggtitle("cisnienie atmosferyczne"),
koszalin_m %>% ACF(visibility, lag_max = 120) %>% autoplot() + ggtitle("widocznosc"),
koszalin_m %>% ACF(dew_point, lag_max = 120) %>% autoplot() + ggtitle("punkt rosy"),
ncol = 2)Temperatura , punkt rosy i widocznosc sa dobrymi predyktorami, poniewaz widac sezonowosc i malejacy trend.
Transformacje
l_m <- koszalin_m %>%
features(obs, features = guerrero) %>%
pull(lambda_guerrero)
grid.arrange(koszalin_m %>% autoplot() + ggtitle("orginal + obs"),
koszalin_m %>% autoplot(box_cox(obs, l_m)) + ggtitle("boxcox + obs"))l_m <- koszalin_m %>%
features(ws, features = guerrero) %>%
pull(lambda_guerrero)
grid.arrange(koszalin_m %>% autoplot(ws) + ggtitle("orginal + ws"),
koszalin_m %>% autoplot(box_cox(ws, l_m)) + ggtitle("boxcox + ws"))Transformacja boxacoxa niewiele zmienia, nie otrzymalismy zwiekszonej jednorodnosci wariancji.
Nasze dane treningowe sa w latach 2015-2018, natomiast dane testowe sa z 2018 roku.
list(model1 = obs~ air_temp,
model2 = log10(obs)~ dew_point,
model3 = log10(obs)~ visibility,
model4 = log10(obs)~ air_temp,
model5 = log(obs)~ air_temp,
model6 = obs^2~ ws) -> formy
map(formy, as.formula) -> formy
out <- map(.x = formy,
.f = ~model(tran, TSLM(formula = .x)) %>% glance()) %>%
do.call(rbind, .) %>%
select(.model, adj_r_squared, CV, AIC, AICc, BIC)
out %>%
mutate(.model = formy) %>%
arrange(AIC) %>%
knitr::kable(digits = 2)| .model | adj_r_squared | CV | AIC | AICc | BIC |
|---|---|---|---|---|---|
| log10(obs) ~ visibility | 0.66 | 0.01 | -181.38 | -180.63 | -176.63 |
| log10(obs) ~ dew_point | 0.63 | 0.01 | -177.69 | -176.94 | -172.94 |
| log10(obs) ~ air_temp | 0.59 | 0.01 | -174.20 | -173.45 | -169.45 |
| log(obs) ~ air_temp | 0.59 | 0.04 | -114.15 | -113.40 | -109.39 |
| obs ~ air_temp | 0.54 | 30.40 | 124.18 | 124.93 | 128.93 |
| obs^2 ~ ws | -0.03 | 213843.83 | 444.91 | 445.66 | 449.66 |
Najlepszym modelem jest model po transormacji logarytmicznej, gdzie predyktorem jest widocznosc.
fit1 <- tran %>%
model(TSLM(formula = log10(obs) ~ visibility))
fit2 <- tran %>%
model(TSLM(formula = log10(obs) ~ dew_point))
fit3 <- tran %>%
model(TSLM(formula = log10(obs) ~ air_temp))Raport modelu f1: log10(obs) ~ visibility
## Series: obs
## Model: TSLM
## Transformation: log10(.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11953 -0.05369 -0.00556 0.05016 0.15398
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.711e+00 4.371e-02 39.149 < 2e-16 ***
## visibility -1.574e-05 1.881e-06 -8.368 9.05e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07623 on 34 degrees of freedom
## Multiple R-squared: 0.6732, Adjusted R-squared: 0.6636
## F-statistic: 70.03 on 1 and 34 DF, p-value: 9.0502e-10
Raport modelu f2: log10(obs) ~ dew_point
## Series: obs
## Model: TSLM
## Transformation: log10(.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.154534 -0.052434 -0.008747 0.065588 0.148560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.465164 0.018945 77.338 < 2e-16 ***
## dew_point -0.019169 0.002477 -7.738 5.31e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08024 on 34 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6272
## F-statistic: 59.88 on 1 and 34 DF, p-value: 5.3104e-09
Raport modelu f3: log10(obs) ~ air_temp
## Series: obs
## Model: TSLM
## Transformation: log10(.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.17080 -0.06008 -0.01020 0.06676 0.17254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.510279 0.025108 60.151 < 2e-16 ***
## air_temp -0.016150 0.002257 -7.155 2.84e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08424 on 34 degrees of freedom
## Multiple R-squared: 0.6009, Adjusted R-squared: 0.5892
## F-statistic: 51.2 on 1 and 34 DF, p-value: 2.842e-08
PowyĆŒsze raporty modeli dostarczaja informacje, ĆŒe we wszystkich modelach stala modelu i zmienna predykcyjna jest istotna statystycznie.
Podglad dopasowania modelu: log10(obs) ~ visibility
fit1 %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p1;p1Na wykresie widac, ze nasz model dobrze oddaje sezonowosc danych oryginalnych. Najwieksze bledy w dopasowaniu wystepuja w wartosciach ekstremalnych.
Podglad dopasowania modelu: log10(obs) ~ dew_point
fit2 %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p2;p2Podglad dopasowania modelu: log10(obs) ~ air_temp
fit3 %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p3;p3Reszty modelu log10(obs) ~ visibility
Reszty modelu log10(obs) ~ dew_point
Reszty modelu log10(obs) ~ air_temp
We wszystkich modelach nie wystpuje autokorelacja reszt. Nie wystepuja problemy z jednorodnoscia wariancji. Rozklady reszt sa podobne do rozkladu normalnego. Najlepiej w dane oryginalne wpisuje sie model f1.
list(model1 = obs ~ trend() + season(),
model2 = obs ~ air_temp + trend(),
model3 = obs ~ dew_point + trend(),
model4 = obs ~ visibility + trend(),
model5 = log(obs) ~ air_temp + trend(),
model6 = log10(obs) ~ air_temp + trend(),
model7 = obs^2 ~ ws + trend(),
model8 = obs ~ air_temp + trend() + season(),
model9 = obs ~ dew_point + trend() + season(),
model10 = obs ~ visibility + trend() + season(),
model11 = log(obs) ~ air_temp + trend() + season(),
model12 = log10(obs) ~ air_temp + trend() + season(),
model13 = obs^2 ~ ws + trend() + season(),
model14 = obs ~ air_temp + season(),
model15 = obs ~ dew_point + season(),
model16 = obs ~ visibility + season(),
model17 = log(obs) ~ air_temp + season(),
model18 = log10(obs) ~ air_temp + season(),
model19 = obs^2 ~ ws + season(),
model20 = log10(obs) ~ trend() + season(),
model21 = log10(obs) ~ visibility + trend() + season(),
model22 = log10(obs) ~ dew_point + trend() + season()) -> formy1
map(formy1, as.formula) -> formy1
out1 <- map(.x = formy1,
.f = ~model(tran, TSLM(formula = .x)) %>% glance()) %>%
do.call(rbind, .) %>%
select(.model, adj_r_squared, CV, AIC, AICc, BIC)
out1 %>%
mutate(.model = formy1) %>%
arrange(AIC) %>%
knitr::kable(digits = 2)| .model | adj_r_squared | CV | AIC | AICc | BIC |
|---|---|---|---|---|---|
| log10(obs) ~ visibility + trend() + season() | 0.74 | 0.01 | -181.73 | -157.73 | -157.97 |
| log10(obs) ~ dew_point + trend() + season() | 0.73 | 0.01 | -181.64 | -157.64 | -157.89 |
| log10(obs) ~ trend() + season() | 0.66 | 0.01 | -173.00 | -153.00 | -150.83 |
| log10(obs) ~ air_temp + trend() | 0.58 | 0.01 | -172.60 | -171.31 | -166.26 |
| log10(obs) ~ air_temp + trend() + season() | 0.65 | 0.01 | -172.05 | -148.05 | -148.30 |
| log10(obs) ~ air_temp + season() | 0.65 | 0.01 | -171.56 | -151.56 | -149.39 |
| log(obs) ~ air_temp + trend() | 0.58 | 0.04 | -112.55 | -111.26 | -106.21 |
| log(obs) ~ air_temp + trend() + season() | 0.65 | 0.05 | -112.00 | -88.00 | -88.24 |
| log(obs) ~ air_temp + season() | 0.65 | 0.05 | -111.51 | -91.51 | -89.34 |
| obs ~ dew_point + trend() + season() | 0.71 | 29.83 | 115.52 | 139.52 | 139.27 |
| obs ~ visibility + trend() + season() | 0.70 | 32.36 | 117.15 | 141.15 | 140.91 |
| obs ~ visibility + season() | 0.69 | 31.08 | 117.48 | 137.48 | 139.65 |
| obs ~ visibility + trend() | 0.62 | 26.21 | 118.26 | 119.55 | 124.60 |
| obs ~ dew_point + season() | 0.67 | 33.04 | 120.11 | 140.11 | 142.28 |
| obs ~ dew_point + trend() | 0.58 | 29.07 | 121.45 | 122.74 | 127.78 |
| obs ~ air_temp + trend() | 0.53 | 32.88 | 125.99 | 127.28 | 132.32 |
| obs ~ air_temp + trend() + season() | 0.60 | 39.93 | 127.10 | 151.10 | 150.86 |
| obs ~ air_temp + season() | 0.60 | 39.28 | 127.23 | 147.23 | 149.40 |
| obs ~ trend() + season() | 0.60 | 38.44 | 127.24 | 147.24 | 149.41 |
| obs^2 ~ ws + trend() + season() | 0.54 | 150228.58 | 424.32 | 448.32 | 448.08 |
| obs^2 ~ ws + season() | 0.53 | 148283.31 | 424.76 | 444.76 | 446.93 |
| obs^2 ~ ws + trend() | -0.06 | 224884.41 | 446.89 | 448.18 | 453.22 |
Mamy 6 dobrych modeli. Wszystkie sa z funkcja log10(). Na trzecim miejscu jest model bez predykotra uwzgledniajacy tylko trend i sezonowosc.
Analiza modelu: log10(obs) ~ visibility + trend() + season()
## Series: obs
## Model: TSLM
## Transformation: log10(.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.10219 -0.05207 0.00722 0.04266 0.08927
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.712e+00 9.146e-02 18.723 5.27e-15 ***
## visibility -1.659e-05 6.004e-06 -2.763 0.0114 *
## trend() 1.505e-03 1.152e-03 1.306 0.2051
## season()year2 5.049e-02 5.528e-02 0.914 0.3709
## season()year3 3.190e-02 5.784e-02 0.552 0.5868
## season()year4 6.548e-02 9.554e-02 0.685 0.5003
## season()year5 3.837e-02 1.087e-01 0.353 0.7274
## season()year6 -5.952e-02 1.114e-01 -0.534 0.5984
## season()year7 -6.117e-02 1.084e-01 -0.564 0.5783
## season()year8 2.941e-02 1.156e-01 0.254 0.8016
## season()year9 5.812e-03 1.030e-01 0.056 0.9555
## season()year10 -3.657e-02 6.244e-02 -0.586 0.5640
## season()year11 -8.566e-02 5.890e-02 -1.454 0.1600
## season()year12 -9.741e-02 6.432e-02 -1.515 0.1441
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06758 on 22 degrees of freedom
## Multiple R-squared: 0.8338, Adjusted R-squared: 0.7356
## F-statistic: 8.489 on 13 and 22 DF, p-value: 8.0561e-06
fit4 %>% glance() %>%
select(.model, adj_r_squared, CV, AIC, AICc, BIC) %>%
knitr::kable(digits = 2) | .model | adj_r_squared | CV | AIC | AICc | BIC |
|---|---|---|---|---|---|
| TSLM(log10(obs) ~ visibility + trend() + season()) | 0.74 | 0.01 | -181.73 | -157.73 | -157.97 |
fit4 %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p4;p4Analiza modelu: log10(obs) ~ dew_point + trend() + season()
## Series: obs
## Model: TSLM
## Transformation: log10(.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1048841 -0.0359005 0.0001054 0.0401365 0.0865309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.402861 0.051942 27.008 <2e-16 ***
## dew_point -0.029760 0.010820 -2.751 0.0117 *
## trend() 0.002418 0.001178 2.052 0.0522 .
## season()year2 0.076816 0.056672 1.355 0.1890
## season()year3 0.087647 0.066828 1.312 0.2032
## season()year4 -0.023036 0.072017 -0.320 0.7521
## season()year5 0.064298 0.117267 0.548 0.5890
## season()year6 0.071737 0.154989 0.463 0.6480
## season()year7 0.137119 0.174681 0.785 0.4408
## season()year8 0.205143 0.174778 1.174 0.2531
## season()year9 0.158882 0.153178 1.037 0.3109
## season()year10 0.147602 0.109863 1.344 0.1928
## season()year11 0.036500 0.083511 0.437 0.6663
## season()year12 -0.063334 0.071171 -0.890 0.3832
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06766 on 22 degrees of freedom
## Multiple R-squared: 0.8334, Adjusted R-squared: 0.735
## F-statistic: 8.466 on 13 and 22 DF, p-value: 8.2395e-06
fit5 %>% glance() %>%
select(.model, adj_r_squared, CV, AIC, AICc, BIC) %>%
knitr::kable(digits = 2) | .model | adj_r_squared | CV | AIC | AICc | BIC |
|---|---|---|---|---|---|
| TSLM(log10(obs) ~ dew_point + trend() + season()) | 0.73 | 0.01 | -181.64 | -157.64 | -157.89 |
fit5 %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p5;p5Analiza modelu: log10(obs) ~ trend() + season()
## Series: obs
## Model: TSLM
## Transformation: log10(.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.150735 -0.046727 0.005389 0.045307 0.130870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.487559 0.047426 31.366 < 2e-16 ***
## trend() 0.001726 0.001305 1.323 0.198940
## season()year2 0.042189 0.062648 0.673 0.507387
## season()year3 -0.015592 0.062688 -0.249 0.805787
## season()year4 -0.149759 0.062756 -2.386 0.025632 *
## season()year5 -0.219933 0.062851 -3.499 0.001932 **
## season()year6 -0.326252 0.062973 -5.181 2.98e-05 ***
## season()year7 -0.318293 0.063122 -5.043 4.20e-05 ***
## season()year8 -0.250407 0.063297 -3.956 0.000627 ***
## season()year9 -0.233269 0.063498 -3.674 0.001260 **
## season()year10 -0.112038 0.063726 -1.758 0.092026 .
## season()year11 -0.132825 0.063979 -2.076 0.049250 *
## season()year12 -0.181742 0.064258 -2.828 0.009528 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07671 on 23 degrees of freedom
## Multiple R-squared: 0.7761, Adjusted R-squared: 0.6593
## F-statistic: 6.644 on 12 and 23 DF, p-value: 5.5406e-05
fit6 %>% glance() %>%
select(.model, adj_r_squared, CV, AIC, AICc, BIC) %>%
knitr::kable(digits = 2) | .model | adj_r_squared | CV | AIC | AICc | BIC |
|---|---|---|---|---|---|
| TSLM(log10(obs) ~ trend() + season()) | 0.66 | 0.01 | -173 | -153 | -150.83 |
fit6 %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p6;p6Analiza modelu: log10(obs) ~ air_temp + trend() + season()
## Series: obs
## Model: TSLM
## Transformation: log10(.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.138751 -0.047484 0.002974 0.047807 0.122570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.487804 0.047795 31.129 <2e-16 ***
## air_temp -0.009031 0.011226 -0.804 0.4297
## trend() 0.001655 0.001318 1.256 0.2225
## season()year2 0.059030 0.066514 0.887 0.3844
## season()year3 0.027852 0.083110 0.335 0.7407
## season()year4 -0.090817 0.096787 -0.938 0.3583
## season()year5 -0.101281 0.160515 -0.631 0.5346
## season()year6 -0.182418 0.189721 -0.962 0.3467
## season()year7 -0.161159 0.205421 -0.785 0.4411
## season()year8 -0.083631 0.216901 -0.386 0.7035
## season()year9 -0.098894 0.178872 -0.553 0.5859
## season()year10 -0.027227 0.123444 -0.221 0.8275
## season()year11 -0.082260 0.090043 -0.914 0.3709
## season()year12 -0.144721 0.079442 -1.822 0.0821 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07731 on 22 degrees of freedom
## Multiple R-squared: 0.7825, Adjusted R-squared: 0.654
## F-statistic: 6.089 on 13 and 22 DF, p-value: 0.00011357
fit7 %>% glance() %>%
select(.model, adj_r_squared, CV, AIC, AICc, BIC) %>%
knitr::kable(digits = 2) | .model | adj_r_squared | CV | AIC | AICc | BIC |
|---|---|---|---|---|---|
| TSLM(log10(obs) ~ air_temp + trend() + season()) | 0.65 | 0.01 | -172.05 | -148.05 | -148.3 |
fit7 %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p7;p7W modelach gdzie zostay uzyte predyktory: visibility, dew point i air_temp dodawanie sezonowosci i trendu nie ma sensu, poniewaz po zbudowaniu modelu nie sa one istotne statystycznie. Modele regresji liniowej prostej z sezonowoscia i trendem moja wyzszy wspolczynnik R kwadrat, lecz ich wada jest to, ze nie wszystkie zmienne uzyte do zbudowania modelu sa istotne statystycznie. Najlepsze dopasowanie uzyskuja modele f4 i f5. Do prognozowania uĆŒyjemy modelu f4.
list(model1 = obs ~ air_temp + ws + atmos_pres + RH,
model2 = obs ~ dew_point + ws + atmos_pres + visibility,
model3 = obs ~ air_temp + ws + atmos_pres + RH + trend(),
model4 = obs ~ air_temp + ws + atmos_pres + RH + season(),
model5 = obs ~ air_temp + ws + atmos_pres + RH + trend() + season(),
model6 = log10(obs) ~ air_temp + ws + atmos_pres + RH,
model7 = log10(obs) ~ dew_point + ws + atmos_pres + visibility,
model8 = log10(obs) ~ air_temp + ws + atmos_pres + RH + trend(),
model9 = log10(obs) ~ air_temp + ws + atmos_pres + RH + season(),
model9 = log10(obs) ~ air_temp + ws + atmos_pres + RH + season() + trend()) -> formy2
map(formy2, as.formula) -> formy2Tylko w jednym przypadku model oparty na niezmienionych danych uzyskal lepsze dopasowanie od modeli opartych na danych zlogarytmowanych.
out2 <- map(.x = formy2,
.f = ~model(tran, TSLM(formula = .x)) %>% glance()) %>%
do.call(rbind, .) %>%
select(.model, adj_r_squared, CV, AIC, AICc, BIC)
out2 %>%
mutate(.model = formy2) %>%
arrange(AIC) %>%
knitr::kable(digits = 2)| .model | adj_r_squared | CV | AIC | AICc | BIC |
|---|---|---|---|---|---|
| log10(obs) ~ air_temp + ws + atmos_pres + RH + season() + trend() | 0.85 | 0.00 | -201.90 | -161.67 | -173.40 |
| log10(obs) ~ dew_point + ws + atmos_pres + visibility | 0.75 | 0.01 | -189.11 | -186.21 | -179.61 |
| log10(obs) ~ air_temp + ws + atmos_pres + RH + trend() | 0.72 | 0.01 | -184.72 | -180.72 | -173.64 |
| log10(obs) ~ air_temp + ws + atmos_pres + RH | 0.69 | 0.01 | -181.80 | -178.90 | -172.30 |
| log10(obs) ~ air_temp + ws + atmos_pres + RH + season() | 0.74 | 0.01 | -181.20 | -147.20 | -154.28 |
| obs ~ air_temp + ws + atmos_pres + RH + trend() + season() | 0.86 | 15.48 | 89.66 | 129.90 | 118.17 |
| obs ~ dew_point + ws + atmos_pres + visibility | 0.73 | 19.88 | 108.29 | 111.19 | 117.79 |
| obs ~ air_temp + ws + atmos_pres + RH + trend() | 0.70 | 22.93 | 112.23 | 116.23 | 123.32 |
| obs ~ air_temp + ws + atmos_pres + RH + season() | 0.73 | 31.68 | 113.69 | 147.69 | 140.61 |
| obs ~ air_temp + ws + atmos_pres + RH | 0.67 | 23.50 | 114.45 | 117.34 | 123.95 |
fit8 <- tran %>%
model(TSLM(obs ~ air_temp + ws + atmos_pres + RH + trend() + season()))
fit8 %>% report()## Series: obs
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.84911 -0.97995 -0.04862 1.09546 4.39270
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -635.05063 179.15905 -3.545 0.002165 **
## air_temp -1.04180 0.49086 -2.122 0.047170 *
## ws 0.74048 1.47185 0.503 0.620682
## atmos_pres 0.71588 0.17224 4.156 0.000536 ***
## RH -0.75477 0.18303 -4.124 0.000578 ***
## trend() 0.24852 0.05536 4.489 0.000251 ***
## season()year2 3.14624 2.51409 1.251 0.225962
## season()year3 -2.56032 3.28793 -0.779 0.445745
## season()year4 -11.99226 3.84859 -3.116 0.005688 **
## season()year5 -12.11047 6.37345 -1.900 0.072698 .
## season()year6 -9.38741 7.65450 -1.226 0.235034
## season()year7 -5.36511 8.40872 -0.638 0.531062
## season()year8 -6.92834 8.93635 -0.775 0.447706
## season()year9 -5.67544 7.55722 -0.751 0.461857
## season()year10 -2.97396 5.43527 -0.547 0.590639
## season()year11 -2.11792 3.72007 -0.569 0.575806
## season()year12 -11.36684 3.55698 -3.196 0.004760 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.9 on 19 degrees of freedom
## Multiple R-squared: 0.9255, Adjusted R-squared: 0.8628
## F-statistic: 14.75 on 16 and 19 DF, p-value: 1.7742e-07
RĂłĆŒnice w dopasowaniu w miejscach wartosci ekstremalnych sa mniejsze niz w regresji liniowej.
fit8 %>% glance() %>%
select(.model, adj_r_squared, CV, AIC, AICc, BIC) %>%
knitr::kable(digits = 2) | .model | adj_r_squared | CV | AIC | AICc | BIC |
|---|---|---|---|---|---|
| TSLM(obs ~ air_temp + ws + atmos_pres + RH + trend() + season()) | 0.86 | 15.48 | 89.66 | 129.9 | 118.17 |
fit8 %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p8;p8Wykres rozkladu nie przypomina wykresu rozkladu normalnego.
W przypadku modelu opartego na danych zlogarytmowanych dopasowanie jest rownie dobre co w przypadku niezlogarytmowanych danych. Wykres rozkladu jest zblizony do rozkadu normalnego.
fit9 <- tran %>%
model(TSLM(log10(obs) ~ air_temp + ws + atmos_pres + RH + season() + trend()))
fit9 %>% report()## Series: obs
## Model: TSLM
## Transformation: log10(.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.093389 -0.014181 -0.001455 0.019357 0.078248
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.095304 3.122856 -2.592 0.017877 *
## air_temp -0.013050 0.008556 -1.525 0.143685
## ws 0.013889 0.025655 0.541 0.594555
## atmos_pres 0.010347 0.003002 3.446 0.002704 **
## RH -0.011738 0.003190 -3.679 0.001593 **
## season()year2 0.029055 0.043822 0.663 0.515288
## season()year3 -0.038150 0.057311 -0.666 0.513622
## season()year4 -0.196309 0.067083 -2.926 0.008662 **
## season()year5 -0.234650 0.111093 -2.112 0.048137 *
## season()year6 -0.240080 0.133423 -1.799 0.087859 .
## season()year7 -0.177750 0.146569 -1.213 0.240094
## season()year8 -0.176245 0.155766 -1.131 0.271937
## season()year9 -0.139951 0.131727 -1.062 0.301360
## season()year10 -0.061785 0.094740 -0.652 0.522114
## season()year11 -0.037162 0.064843 -0.573 0.573298
## season()year12 -0.183605 0.062001 -2.961 0.008019 **
## trend() 0.003944 0.000965 4.087 0.000628 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05056 on 19 degrees of freedom
## Multiple R-squared: 0.9197, Adjusted R-squared: 0.852
## F-statistic: 13.6 on 16 and 19 DF, p-value: 3.4933e-07
fit9 %>% glance() %>%
select(.model, adj_r_squared, CV, AIC, AICc, BIC) %>%
knitr::kable(digits = 2) | .model | adj_r_squared | CV | AIC | AICc | BIC |
|---|---|---|---|---|---|
| TSLM(log10(obs) ~ air_temp + ws + atmos_pres + RH + season() + | |||||
| trend()) | 0.85 | 0 | -201.9 | -161.67 | -173.4 |
fit9 %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p9;p9Roznice w dopasowaniu sa duze. Widac to szczegĂłlnie w styczniu 2017.
Wystepuje duza sezonowosc
## Series: obs
## Model: ETS(M,N,M)
## Smoothing parameters:
## alpha = 0.0001002167
## gamma = 0.0001000006
##
## Initial states:
## l s1 s2 s3 s4 s5 s6 s7
## 24.13163 0.9360705 1.013209 1.114605 0.7976731 0.7781777 0.6502307 0.6498614
## s8 s9 s10 s11 s12
## 0.8276024 0.9594083 1.254245 1.594173 1.424744
##
## sigma^2: 0.0331
##
## AIC AICc BIC
## 244.9618 268.9618 268.7146
Dopasowanie jest gorsze niz w przypadku wczesniejszych modeli
| .model | sigma2 | log_lik | AIC | AICc | BIC | MSE | AMSE | MAE |
|---|---|---|---|---|---|---|---|---|
| ETS(obs) | 0.03 | -107.48 | 244.96 | 268.96 | 268.71 | 17.27 | 15.64 | 0.11 |
Ponownie mozna zauwazyc ze wystepuje problem z dopasowaniem wartosci ekstremalnych.
fit_w %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p6;p6Pomimo gorszego dopasowanie wzgledem poprzednich modeli w tym przypadku tego modelu nie wystepuja problemy z wykresem reszt ani z rozkladem
## Series: obs
## Model: ARIMA(0,0,0)(1,1,0)[12]
##
## Coefficients:
## sar1
## -0.5607
## s.e. 0.1980
##
## sigma^2 estimated as 37.54: log likelihood=-79.31
## AIC=162.63 AICc=163.2 BIC=164.98
Slabe AIC i BIC.
| .model | sigma2 | log_lik | AIC | AICc | BIC | ar_roots | ma_roots |
|---|---|---|---|---|---|---|---|
| arima | 37.54 | -79.31 | 162.63 | 163.2 | 164.98 | 0.7420368+0.7420368i, -0.7420368+0.7420368i, -0.7420368-0.7420368i, 1.0136412-0.2716043i, 0.2716043+1.0136412i, -1.0136412+0.2716043i, -0.2716043-1.0136412i, 1.0136412+0.2716043i, -0.2716043+1.0136412i, -1.0136412-0.2716043i, 0.2716043-1.0136412i, 0.7420368-0.7420368i |
W przypadku tego modelu sytuacja z prognozowaniem wartosci ekstemalnych jest identyczna jak w przypadku wczesniejszych modeli.
fit_a %>%
augment() %>%
ggplot(aes(x = date)) +
geom_line(aes(y = .fitted, color = "model")) +
geom_line(aes(y = obs, color = "dane")) +
labs(color = "Reprezentacja") -> p_a;p_alist(model1 = log10(obs) ~ ws,
model2 = log10(obs) ~ ws + RH,
model3 = log10(obs) ~ ws + I(ws^2),
model4 = log10(obs) ~ ws + atmos_pres,
model5 = log10(obs) ~ ws + atmos_pres + RH,
model6 = log10(obs) ~ ws + I(ws^2) + atmos_pres + RH,
model7 = log10(obs) ~ air_temp + ws + atmos_pres + RH,
model8 = log10(obs) ~ ws + dew_point + atmos_pres,
model9 = obs ~ air_temp + ws + atmos_pres + RH,
model10 = obs ~ dew_point + atmos_pres + ws) -> formy_dyn
map(formy_dyn, as.formula) -> formy_dynDopasowanie w przypadku modeli dynamicznych jest bardzo slabe
out_dyn <- map(.x = formy_dyn,
.f = ~model(tran, ARIMA(formula = .x))
%>% glance()) %>%
do.call(rbind, .) %>%
select(.model, AIC, AICc, BIC)
out_dyn %>%
mutate(.model = formy_dyn) %>%
arrange(AIC) %>%
knitr::kable(digits = 2)| .model | AIC | AICc | BIC |
|---|---|---|---|
| log10(obs) ~ ws + dew_point + atmos_pres | -89.13 | -86.23 | -79.63 |
| log10(obs) ~ air_temp + ws + atmos_pres + RH | -86.63 | -82.63 | -75.54 |
| log10(obs) ~ ws + atmos_pres + RH | -53.69 | -48.75 | -46.62 |
| log10(obs) ~ ws + I(ws^2) + atmos_pres + RH | -53.22 | -46.22 | -44.98 |
| log10(obs) ~ ws + atmos_pres | -48.64 | -45.30 | -42.74 |
| log10(obs) ~ ws + I(ws^2) | -43.89 | -38.94 | -36.82 |
| log10(obs) ~ ws + RH | -42.96 | -39.63 | -37.07 |
| log10(obs) ~ ws | -41.61 | -39.50 | -36.89 |
| obs ~ dew_point + atmos_pres + ws | 205.73 | 208.62 | 215.23 |
| obs ~ air_temp + ws + atmos_pres + RH | 207.72 | 211.72 | 218.81 |
fit_d <- tran %>%
model(arima = ARIMA(log10(obs) ~ air_temp + ws + atmos_pres , stepwise = F, approximation = F))
fit_d %>% report()## Series: obs
## Model: LM w/ ARIMA(0,0,1)(1,0,0)[12] errors
## Transformation: log10(.x)
##
## Coefficients:
## ma1 sar1 air_temp ws atmos_pres
## 0.5387 -0.5335 -0.0196 -0.0840 0.0018
## s.e. 0.1270 NaN 0.0018 0.0153 NaN
##
## sigma^2 estimated as 0.004086: log likelihood=48.43
## AIC=-84.86 AICc=-81.96 BIC=-75.36
| .model | sigma2 | log_lik | AIC | AICc | BIC | ar_roots | ma_roots |
|---|---|---|---|---|---|---|---|
| arima | 0 | 48.43 | -84.86 | -81.96 | -75.36 | 0.7451194+0.7451194i, -0.7451194+0.7451194i, -0.7451194-0.7451194i, 1.0178521-0.2727326i, 0.2727326+1.0178521i, -1.0178521+0.2727326i, -0.2727326-1.0178521i, 1.0178521+0.2727326i, -0.2727326+1.0178521i, -1.0178521-0.2727326i, 0.2727326-1.0178521i, 0.7451194-0.7451194i | -1.856398+0i |
fc_1 <- forecast(fit1, test)
fc_1 %>%
autoplot(koszalin_m %>% filter(year(date) > 2010), level = 95) + xlab("Year") +
scale_color_brewer(type = "qual", palette = "Dark2")->p_1;p_1fc_4 <- forecast(fit4, test)
fc_4 %>%
autoplot(koszalin_m %>% filter(year(date) > 2010), level = 95) + xlab("Year") +
scale_color_brewer(type = "qual", palette = "Dark2")->p_4;p_4fc_wielo <- forecast(fit8,test)
fc_wielo %>%
autoplot(koszalin_m %>% filter(year(date) > 2010), level = 95) + xlab("Year") +
scale_color_brewer(type = "qual", palette = "Dark2") -> p_wielo ; p_wielofc_w <- fit_w %>% forecast(h = "12 months")
fc_w %>%
autoplot(koszalin_m %>% filter(year(date) > 2010), level = 95) + xlab("Year") +
scale_color_brewer(type = "qual", palette = "Dark2")->p_w; p_wfc_a <- fit_a %>% forecast(h = "12 months")
fc_a %>%
autoplot(koszalin_m %>% filter(year(date) > 2010), level = 95) + xlab("Year") +
scale_color_brewer(type = "qual", palette = "Dark2")->p_a;p_afc_d <- forecast(fit_d,test)
fc_d %>%
autoplot(koszalin_m %>% filter(year(date) > 2010), level = 95) + xlab("Year") +
scale_color_brewer(type = "qual", palette = "Dark2") -> p_d ; p_dgrid.arrange( p_1 + ggtitle('Regresja liniowa prosta'),
p_4 + ggtitle('Regresja liniowa z trendem i sezonowoscia'),
p_wielo + ggtitle('ETS'))grid.arrange( p_w + ggtitle('Model wykladniczy'),
p_a + ggtitle('ARIMA'),
p_d + ggtitle('Model dynamiczny'))bind_rows(
fit1%>%accuracy(),
fit4%>%accuracy(),
fit8%>%accuracy(),
fit_w%>%accuracy(),
fit_a%>%accuracy(),
fit_d%>%accuracy(),
fc_1%>%accuracy(test),
fc_4%>%accuracy(test),
fc_wielo%>%accuracy(test),
fc_w%>%accuracy(test),
fc_a%>%accuracy(test),
fc_d%>%accuracy(test))->dopasowanie
knitr::kable(dopasowanie, digits = 2 )| .model | .type | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 |
|---|---|---|---|---|---|---|---|---|
| TSLM(formula = log10(obs) ~ visibility) | Training | 0.39 | 4.44 | 3.53 | -1.46 | 14.65 | 0.65 | 0.32 |
| TSLM(log10(obs) ~ visibility + trend() + season()) | Training | 0.21 | 3.10 | 2.47 | -0.75 | 10.26 | 0.45 | 0.18 |
| TSLM(obs ~ air_temp + ws + atmos_pres + RH + trend() + season()) | Training | 0.00 | 2.11 | 1.56 | -0.64 | 6.78 | 0.29 | -0.06 |
| ETS(obs) | Training | -0.06 | 4.16 | 2.90 | -2.19 | 12.03 | 0.53 | 0.21 |
| arima | Training | 0.69 | 4.90 | 3.01 | 0.77 | 11.81 | 0.55 | 0.23 |
| arima | Training | 0.41 | 3.57 | 2.78 | -0.65 | 11.43 | 0.51 | -0.02 |
| TSLM(formula = log10(obs) ~ visibility) | Test | 2.17 | 4.74 | 3.96 | 7.18 | 14.73 | NaN | 0.28 |
| TSLM(log10(obs) ~ visibility + trend() + season()) | Test | -0.05 | 2.80 | 2.26 | -0.41 | 8.84 | NaN | 0.01 |
| TSLM(obs ~ air_temp + ws + atmos_pres + RH + trend() + season()) | Test | -4.70 | 7.15 | 5.60 | -20.49 | 23.49 | NaN | 0.16 |
| ETS(obs) | Test | 1.99 | 5.29 | 4.21 | 6.24 | 15.20 | NaN | 0.08 |
| arima | Test | 2.04 | 5.57 | 4.32 | 6.09 | 15.64 | NaN | 0.03 |
| arima | Test | 2.81 | 5.23 | 4.44 | 11.28 | 17.70 | NaN | -0.06 |
Model regresji wielorakiej bardzo dobrze wpasowuje sie w dane treningowe jednak ma bardzo slabe wyniki przy prognozowaniu. Bardzo dobre parametry przy danych treningowych i najlepsze podczas testowania otrzymal model regresji liniowej uwzgledniajacy trend i sezonowosc. Zatem do prognozowania na naszym zbiorze danych najlepiej nadaje sie model: TSLM(log10(obs) ~ visibility + trend() + season())